The violent crime rate in U.S increased by 3.4 percent nationwide in 2016 in US. As an international student, as well as a New Yorker, the public safety in NYC is always a concern to us, especially after the recent terrorists attack near the World Trade Center. Thus, our group decided to make a deeper investigation of the crime data and seek out some underlying reasons which led to the increase of crime rate.
The New York City Police Department provides overall crime dataset. NYPD also established a CompStat model, called CompStat 2.0, providing greater specificity about crimes through an online interactive experience.\ On the official website of new york city, there is also a Crime Map which enables people to view crime by precinct. This map includes crimes of seven major felonies.
Since the dataset has 341716, 9, we randomly sample 50000 observations and creat an interactive map showing locations where the crimes in New York City occured:
sample <- nyc_crime_2017[sample(1:nrow(nyc_crime_2017), 50000,replace=FALSE),]
sample %>%
mutate(text_label = str_c("Offense desc:", ofns_desc, ' Boro: ', boro)) %>%
plot_ly(x = ~longitude, y = ~latitude, type = "scatter", mode = "markers",
alpha = 0.5,
color = ~ofns_type,
text = ~text_label)
## Warning: Ignoring 710 observations
Bar chart showing crime number and offense type in different boro:
barplot = nyc_crime_2017 %>%
mutate(boro = fct_infreq(boro)) %>%
ggplot(aes(x = boro, fill = ofns_type)) + geom_bar()
ggplotly(barplot)
*Collect historic data of crimes
nyc_hist_vio = read_excel("./historic/violation-offenses-2000-2016.xls", range = "A4:R6") %>%
mutate(ofns_type = "VIOLATION")
nyc_hist_felony_7 = read_excel("./historic/seven-major-felony-offenses-2000-2016.xls", range = "A5:R12") %>%
mutate(ofns_type = "FELONY")
nyc_hist_felony = read_excel("./historic/non-seven-major-felony-offenses-2000-2016.xls", range = "A5:R13") %>%
mutate(ofns_type = "FELONY")
nyc_hist_mis = read_excel("./historic/misdemeanor-offenses-2000-2016.xls", range = "A4:R21")%>%
mutate(ofns_type = "MISDEMEANOR")
We combine the information of crimes in past 16 years.
nyc_crime_hist = nyc_hist_mis %>%
full_join(nyc_hist_felony) %>%
full_join(nyc_hist_felony_7) %>%
full_join(nyc_hist_vio) %>%
mutate(ofns_type = as.factor(ofns_type), ofns_desc = OFFENSE) %>%
select(-OFFENSE)
nyc_crime_hist = nyc_crime_hist %>%
gather(key = year, value = count, "2000":"2016") %>%
group_by(year, ofns_type) %>%
summarize(n = sum(count)/12) %>%
full_join(nyc_crime_2017 %>%
group_by(ofns_type) %>%
summarize(n = n()/10) %>%
mutate(year = "2017")) %>%
ungroup()
## Warning: Column `ofns_type` joining factor and character vector, coercing
## into character vector
nyc_crime_hist %>%
mutate(year = as.numeric(year)) %>%
ggplot(aes(x = year, y = n, fill = ofns_type)) + geom_bar(stat = "identity")
We then focused on crime data of current year.
Make a plot of crime count versus hour in a day and group by boro.
nyc_crime_2017 %>%
mutate(hour = hour(time)) %>%
group_by(hour, boro) %>%
summarize(n = n()) %>%
ggplot(aes(x = hour, y = n, color = boro)) + geom_point(alpha = 0.5) + geom_line()
Make a crime rate plot based on 2017 data
crime_tidy = nyc_crime_2017 %>%
separate(date, into = c("year", "month","day"), sep = "-") %>%
select(-year, -day) %>%
group_by(month,boro) %>%
summarize(crime_count = n())
crimetotal = ggplot(crime_tidy, aes(x = month, y = crime_count, color = boro)) +
geom_point() + geom_path(aes(group = boro)) +
theme(legend.position = "bottom")
crime_rate = crime_tidy %>%
mutate(popluation = recode(boro, "BRONX" = 1455720,
"BROOKLYN" = 2629150,
"MANHATTAN" = 1643734,
"QUEENS" = 2333054,
"STATEN ISLAND" = 476015)) %>%
mutate(crime_rate = (crime_count/popluation)*100000)
crimerate = ggplot(crime_rate, aes(x = month, y = crime_rate, color = boro)) +
geom_point() + geom_path(aes(group = boro)) +
theme(legend.position = "bottom")
library(gridExtra)
grid.arrange(crimetotal, crimerate, ncol = 2)
The count of different type of crimes based on 2017 data
crime_tidy2 = nyc_crime_2017 %>%
group_by(date, ofns_type) %>%
summarize(crime_count = n())
ggplot(crime_tidy2, aes(x = date, y = crime_count, color = ofns_type)) +
geom_point(alpha = .6) + geom_smooth() +
theme(legend.position = "bottom")
Top 10 words in of offense description:
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.4.2
crime_words = nyc_crime_2017 %>%
select(-longitude, -latitude) %>%
mutate(ofns_desc = str_to_lower(ofns_desc),
ofns_desc = as.character(ofns_desc)) %>%
unnest_tokens(word, ofns_desc)
data(stop_words)
crime_word_tidy =
anti_join(crime_words, stop_words)
crime_word_tidy %>%
count(word, sort = TRUE) %>%
top_n(10) %>%
mutate(word = fct_reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_bar(stat = "identity", fill = "blue", alpha = .6) +
coord_flip()
Comparison of distinct words in offense type of violation and felony.
word_ratios = crime_word_tidy %>%
filter(ofns_type %in% c("VIOLATION" , "FELONY")) %>%
count(word, ofns_type) %>%
group_by(word) %>%
filter(sum(n) >= 5) %>%
ungroup() %>%
spread(ofns_type, n, fill = 0) %>%
mutate(
violation_odds = (VIOLATION + 1) / (sum(VIOLATION) + 1),
felony_odds = (FELONY + 1) / (sum(FELONY) + 1),
log_OR = log(felony_odds / violation_odds)
) %>%
arrange(desc(log_OR))
word_ratios %>%
mutate(pos_log_OR = ifelse(log_OR > 0, "felony_odds >violation_odds" ,"violation_odds > felony_odds")) %>%
group_by(pos_log_OR) %>%
top_n(10, abs(log_OR)) %>%
ungroup() %>%
mutate(word = fct_reorder(word, log_OR)) %>%
ggplot(aes(word, log_OR, fill = pos_log_OR)) +
geom_col() +
coord_flip() +
ylab("log odds ratio (felony_odds/violation_odds)") +
scale_fill_discrete(name = "") +
theme(legend.position = "bottom")